clear all

* The working directory must be the Data folder before this file is run.
use "retrostudy1 data.dta"


* ---- IP status variables ----

* nonus: 1 unless countryname is exactly "United States"
gen nonus = (countryname != "United States")

* blockfull: a copy of block, except that observations with block==0 and a
* non-US countryname receive code 3
gen blockfull = cond(nonus == 1 & block == 0, 3, block)

* blockdi: blockfull with code 2 folded into 0 and code 3 folded into 1
gen blockdi = blockfull
recode blockdi (2 = 0) (3 = 1)

label define blocklab 0 "Valid" 1 "VPS User" 2 "VPS Uncertain" 3 "Foreign"
label values blockfull blocklab

* valid: 1 only when countryname is "United States" and block is not 1;
* fraudulent is the 0/1 complement of valid
gen valid = (countryname == "United States" & block != 1)
recode valid (0 = 1) (1 = 0), gen(fraudulent)

* ---- age check: reported age versus year born ----
* NOTE(review): the +7 offset presumably maps the coded value of born onto an
* expected age -- confirm against the codebook.
gen ageborn = born + 7
* one non-numeric entry is blanked and an entry of "1988" is replaced by "30"
replace age2 = "" if age2 == "37R_2rO9wE6VeOIHzES"
replace age2 = "30" if age2 == "1988"
gen agereport = real(age2)
gen agediff = ageborn - agereport
* failage is 1 unless agediff lies between 0 and 1; a missing agediff
* compares as larger than 1 in Stata, so it is flagged as a failure too
gen failage = (agediff < 0 | agediff > 1)

* ---- city check ----
* only city2 code 11 passes; every other value (else also covers missing)
* is coded as a failure
gen failcity = city2
recode failcity (11 = 0) (else = 1)

* ---- attention check index ----
* checkindex is the scale that alpha generates from the five check items
alpha failcity failage susplocation susppurpose suspcomments, item gen(checkindex)
* checkindexdi: any index value from .1 through 1 becomes 1
gen checkindexdi = checkindex
recode checkindexdi (.1/1 = 1)
* check5: the index multiplied by 5, the number of check items
gen check5 = 5*checkindex

* ---- political knowledge ----
* the pk*DO* variables are dropped before pk* is used as a varlist below
drop pk*DO*
* each item becomes 1 for the keyed response code and 0 for anything else
* (else also covers missing)
recode pk1 (3 = 1) (else = 0)
recode pk2 pk3 (1 = 1) (else = 0)
recode pk4 (2 = 1) (else = 0)
alpha pk*, gen(Know) item
* Know4: the generated scale multiplied by 4
gen Know4 = 4*Know

* ---- time on the knowledge page, logged ----
gen ln_t_know = ln(t_know_PageSubmit)

* ---- total duration minus time on the knowledge page, raw and logged ----
gen tminusknow = Durationinsecond - t_know_PageSubmit
gen ln_tminusknow = ln(tminusknow)

* ---- drop at CAPTCHA? ----
* defined only when Q306_ClickCount is not "."; equals 1 when
* Q307_ClickCount is "." and 0 otherwise
gen capdrop = (Q307_ClickCount == .) if Q306_ClickCount != .

* ---- demographics ----

* PID: pid is first spread onto codes 2, 4 and 6; the follow-up items
* pidd, pidr and pidi then move cases to 1, 7, 5 and 3. The steps are
* order-dependent and must run in this sequence.
gen PID = pid
recode PID (2 = 6) (1 = 2) (3/4 = 4)
recode PID (2 = 1) if pidd == 1
recode PID (6 = 7) if pidr == 1
recode PID (4 = 5) if pidi == 2
recode PID (4 = 3) if pidi == 1

* male: sex with codes 2 through 3 set to 0
gen male = sex
recode male (2/3 = 0)

* white: race with codes 2 through 5 set to 0
gen white = race
recode white (2/5 = 0)


****************
* Rep Analysis *
****************

* share of respondents failing at least one check
tabulate checkindexdi

* distribution of IP status
tabulate blockfull

* data quality by IP status: all categories first, then three pairwise
* tables obtained by excluding two blockfull codes at a time
tabulate checkindexdi blockfull, column chi2
tabulate checkindexdi blockfull if !inlist(blockfull, 2, 3), column chi2
tabulate checkindexdi blockfull if !inlist(blockfull, 1, 2), column chi2
tabulate checkindexdi blockfull if !inlist(blockfull, 0, 2), column chi2

* figure 2. prevalence of low-quality data by respondent ip type
* one proportion model per blockfull code 0-3, stored as m1-m4
forvalues status = 0/3 {
	local slot = `status' + 1
	proportion checkindexdi if blockfull == `status'
	estimates store m`slot'
}

* every graph saved from here on is written to the Output Figures folder
cd "../Output Figures"
coefplot m1 m2 m3 m4, keep(1) vert citype(logit) legend(off) ///
	recast(bar) barwidth(.1) color(gs10) citop ciopts(lcolor(gs5)) ///
	xlab(.7 "Valid" .9 "VPS User" 1.1 "VPS - Uncertain" 1.3 "Foreign") ///
	ylab(0(.25).75) ytitle("Proportion Flagged for Low-Quality Data") ///
	saving(figure2.gph, replace)

* figure a1, panel "IP Void": proportion flagged within three groups defined
* by the ipvoidus and ipvoidvps indicators (bar labels are set in xlab below)
proportion checkindexdi if ipvoidus==0 & ipvoidvps==0
estimates store v1
proportion checkindexdi if ipvoidvps==1
estimates store v2
proportion checkindexdi if ipvoidus==1 & ipvoidvps==0
estimates store v3

coefplot v1 v2 v3, keep(1) vert citype(logit) legend(off) ///
	recast(bar) barwidth(.1) color(gs10) citop ciopts(lcolor(gs5)) ///
	xlab(.75 `""Foreign=0" "& VPS=0""' 1 "VPS=1" 1.25 `""Foreign=1" "& VPS=0""') ///
	ylab(0(.25).75) ytitle("Proportion Low-Quality Data") ///
	subtitle("IP Void") saving(void.gph, replace)

* figure a1, panel "IP Abuse": same layout using the abuseipdbus and
* abuseipdbvps indicators
proportion checkindexdi if abuseipdbus==0 & abuseipdbvps==0
estimates store a1
proportion checkindexdi if abuseipdbvps==1
estimates store a2
proportion checkindexdi if abuseipdbus==1 & abuseipdbvps==0
estimates store a3

coefplot a1 a2 a3, keep(1) vert citype(logit) legend(off) ///
	recast(bar) barwidth(.1) color(gs10) citop ciopts(lcolor(gs5)) ///
	xlab(.75 `""Foreign=0" "& VPS=0""' 1 "VPS=1" 1.25 `""Foreign=1" "& VPS=0""') ///
	ylab(0(.25).75) ytitle("Proportion Low-Quality Data") ///
	subtitle("IP Abuse") saving(abuse.gph, replace)

* figure a1: the figure 2 graph combined with the two panels saved above
graph combine figure2.gph void.gph abuse.gph, saving(figurea1.gph, replace)


* correlations between the fraudulent indicator and each quality measure
* (susp* expands to every variable whose name starts with susp)
correlate fraudulent failcity failage susp*

* appendix a6. individual quality checks: proportions and inter-correlations
proportion failcity failage susplocation susppurpose suspcomments
correlate failcity failage susplocation susppurpose suspcomments

* appendix a6. number of failed checks by iphub scores
* NOTE(review): check5 is the index times 5, while the axis is labelled only
* from 0 to 4 -- confirm that no label is needed at 5.
histogram check5, discrete ///
	by(blockfull, rows(1) note("") title("Number of Failed Quality Checks by VPS Status (Study 1)")) ///
	xlabel(0(1)4) xtitle("Number of Failed Quality Checks") ///
	saving(checksbystatus1.gph, replace)


* political knowledge scores by ip status: group summaries, then OLS without
* and with covariates (stored as k1 and k2)
bysort blockfull: summarize Know4
regress Know4 i.blockfull
estimates store k1
regress Know4 i.blockfull interest educ male white
estimates store k2

* logged time spent on the knowledge page by ip status (stored as k3 and k4);
* the second model also conditions on logged time spent elsewhere
bysort blockfull: summarize t_know_PageSubmit
regress ln_t_know i.blockfull
estimates store k3
regress ln_t_know i.blockfull ln_tminusknow interest educ male white
estimates store k4

* table a1: all stored models whose names start with k
estout k*, cells(b(star fmt(2)) se) stats(N r2) ///
	starlevels(+ .10 * .05 ** .01 *** .001)


* ideology analysis: PID-ideo correlation within each ip status, then PID
* regressed on the full blockfull-by-ideo interaction (stored as p1 and p2)
bysort blockfull: pwcorr PID ideo, sig obs
regress PID i.blockfull##c.ideo
estimates store p1
regress PID i.blockfull##c.ideo interest educ male white
estimates store p2
estout p*, cells(b(star fmt(2)) se) stats(N r2) ///
	starlevels(+ .10 * .05 ** .01 *** .001)

* table a2, column 1
estout p1, cells(b(star fmt(2)) se) stats(N r2) ///
	starlevels(+ .10 * .05 ** .01 *** .001)


** re-analysis of experiment **

* shift the trait item suffixes 6 and 7 down to 5 and 6
rename traits*_6 traits*_5
rename traits*_7 traits*_6

* collapsing traits across conditions: for every vignette (1-6) and item (1-6)
* take the first nonmissing value across the ten condition-specific variables,
* searched in the order listed here
forvalues v = 1/6 {
	forvalues item = 1/6 {
		local sources
		foreach arm in do dm de ro rm re c cl cm cc {
			local sources `sources' traits_v`v'`arm'_`item'
		}
		egen traits_v`v'_`item' = rowfirst(`sources')
	}
}

* trait index per vignette: mean of items 3-5 minus mean of items 1-2
forvalues v = 1/6 {
	gen traitindex`v'=((traits_v`v'_3 + traits_v`v'_4 + traits_v`v'_5)/3) - ((traits_v`v'_1 + traits_v`v'_2)/2)
}

* experimental condition per vignette: codes 1-10 are assigned in the order
* of the list below, based on which condition-specific item 1 is not ".";
* if several were non-missing, the code assigned last would remain
forvalues v = 1/6 {
	gen cond`v' = .
	local code = 0
	foreach arm in c re cc ro rm cm dm do cl de {
		local ++code
		replace cond`v' = `code' if traits_v`v'`arm'_1 != .
	}
}

* reshape to one row per respondent (ID) and vignette
reshape long traitindex cond, i(ID) j(vignette)

label define condlab 1 "Control" 2 "Consistent Rep." 3 "Consistent Con." 4 "Rep. Only" 5 "Inconsistent Rep." 6 "Inconsistent Only" 7 "Inconsistent Dem." 8 "Dem. Only" 9 "Consistent Lib." 10 "Consistent Dem."
label values cond condlab

* declare ID as the panel variable for the xtreg models below
xtset ID

*estimating treatment effects among full sample
* Fixed-effects model of traitindex on condition and vignette indicators with
* standard errors clustered on ID, stored as "full".
* The coefficient for each cond level 2-10 is then written into observations
* 1-9 of fullsample, with estimate -/+ 1.96*se in fslo and fshi; all other
* observations stay missing. These nine rows feed the scatter plots and the
* comparison regressions later in the file.
gen fullsample=.
gen fslo=.
gen fshi=.
xtreg traitindex i.cond i.vignette, fe vce(cluster ID)
est store full
forvalues level = 2/10 {
	* cond level k is stored in row k-1
	local row = `level' - 1
	lincom `level'.cond
	* replace is not r-class, so lincom's r(estimate) and r(se) remain
	* available for all three statements
	replace fullsample = r(estimate) in `row'
	replace fslo = r(estimate) - 1.96*r(se) in `row'
	replace fshi = r(estimate) + 1.96*r(se) in `row'
}

*estimating treatment effects among valid sample
* Same fixed-effects model restricted to valid==1, stored as "valid".
* The coefficient for each cond level 2-10 is written into observations 1-9
* of validsample, with estimate -/+ 1.96*se in vslo and vshi; all other
* observations stay missing.
gen validsample=.
gen vslo=.
gen vshi=.
xtreg traitindex i.cond i.vignette if valid==1, fe vce(cluster ID)
est store valid
forvalues level = 2/10 {
	* cond level k is stored in row k-1
	local row = `level' - 1
	lincom `level'.cond
	* replace is not r-class, so lincom's r(estimate) and r(se) remain
	* available for all three statements
	replace validsample = r(estimate) in `row'
	replace vslo = r(estimate) - 1.96*r(se) in `row'
	replace vshi = r(estimate) + 1.96*r(se) in `row'
}

*estimating treatment effects among fraudulent sample
* Same fixed-effects model restricted to valid==0, stored as "fraud".
* The coefficient for each cond level 2-10 is written into observations 1-9
* of fraudsample, with estimate -/+ 1.96*se in rslo and rshi; all other
* observations stay missing.
gen fraudsample=.
gen rslo=.
gen rshi=.
xtreg traitindex i.cond i.vignette if valid==0, fe vce(cluster ID)
est store fraud
forvalues level = 2/10 {
	* cond level k is stored in row k-1
	local row = `level' - 1
	lincom `level'.cond
	* replace is not r-class, so lincom's r(estimate) and r(se) remain
	* available for all three statements
	replace fraudsample = r(estimate) in `row'
	replace rslo = r(estimate) - 1.96*r(se) in `row'
	replace rshi = r(estimate) + 1.96*r(se) in `row'
}

* figure 3. comparing treatment effects among valid and fraudulent respondents
* left-hand input: full-sample estimates (y) against valid-sample estimates (x),
* with horizontal caps for the valid-sample interval, vertical caps for the
* full-sample interval, and a y=x reference line
twoway (scatter fullsample validsample) ///
	(rcap vslo vshi fullsample, horizontal lcolor(gs10%50)) ///
	(rcap fslo fshi validsample, vertical lcolor(gs10%50)) ///
	(function y=x, range(-1 1)), ///
	ylabel(-1(.5)1) xlabel(-1(.5)1) ///
	ytitle("Treatment Effects Among Full Sample") ///
	xtitle("Treatment Effects Among Valid Sample") ///
	legend(off) saving(fullvsvalid.gph, replace)

* same layout with the fraudulent-sample estimates on the y axis
twoway (scatter fraudsample validsample) ///
	(function y=x, range(-1 1)) ///
	(rcap vslo vshi fraudsample, horizontal lcolor(gs10%50)) ///
	(rcap rslo rshi validsample, vertical lcolor(gs10%50)), ///
	ylabel(-1(.5)1) xlabel(-1(.5)1) ///
	ytitle("Treatment Effects Among Fraudulent Sample") ///
	xtitle("Treatment Effects Among Valid Sample") ///
	legend(off) saving(fraudvsvalid.gph, replace)

graph combine fraudvsvalid.gph fullvsvalid.gph, saving(figure3.gph, replace)

* table a3: the three stored fixed-effects models side by side
estout full valid fraud, cells(b(star fmt(2)) se) stats(N r2) ///
	starlevels(+ .10 * .05 ** .01 *** .001)

* comparing treatment effects: only the rows holding stored estimates are
* non-missing, so these regressions use those rows alone
regress fullsample validsample
regress fraudsample validsample
